<!DOCTYPE html>
<html lang="en">

<!-- Head tag -->
<head>
    <!-- Character encoding is declared first so it precedes all non-ASCII text below -->
    <meta charset="utf-8">
    <meta name="generator" content="Hexo 3.8.0">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1">

    <!-- Description -->
    <meta name="description" content="欢迎！">

    <!-- Author -->
    <meta name="author" content="fafa">

    <!-- Open Graph metadata: title, description, site name, page type -->
    <meta property="og:title" content="Run-Fa Zhang's Site">
    <meta property="og:description" content="欢迎！">
    <meta property="og:site_name" content="Run-Fa Zhang's Site">
    <meta property="og:type" content="website">

    <!-- Page Cover (nothing emitted for this page) -->

    <meta name="twitter:card" content="summary">

    <!-- Title -->
    <title>Run-Fa Zhang's Site</title>

    <!-- Third-party stylesheets use explicit https: protocol-relative URLs fail when the
         page is opened from file:// and would follow an insecure page scheme over http. -->

    <!-- Bootstrap Core CSS -->
    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0-alpha.2/css/bootstrap.min.css" integrity="sha384-y3tfxAZXuh4HwSYylfB+J125MxIs6mR5FOHamPBG064zB+AFeWH94NdvaCBm8qnd" crossorigin="anonymous">

    <!-- Custom Fonts -->
    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/font-awesome/4.6.3/css/font-awesome.min.css">

    <!-- HTML5 Shim and Respond.js IE8 support of HTML5 elements and media queries -->
    <!-- WARNING: Respond.js doesn't work if you view the page via file:// -->
    <!--[if lt IE 9]>
        <script src="//oss.maxcdn.com/libs/html5shiv/3.7.0/html5shiv.js"></script>
        <script src="//oss.maxcdn.com/libs/respond.js/1.4.2/respond.min.js"></script>
    <![endif]-->

    <!-- Gallery -->
    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/featherlight/1.3.5/featherlight.min.css">

    <!-- Custom CSS -->
    <link rel="stylesheet" href="/css/style.css">

    <!-- Google Analytics (nothing emitted for this page) -->

</head>


<body>

<!-- Two empty placeholder elements; presumably background layers styled by /css/style.css (confirm against the stylesheet) -->
<div class="bg-gradient"></div>
<div class="bg-pattern"></div>

<!-- Menu -->
<!--Menu Links and Overlay-->
<div class="menu-bg">
    <div class="menu-container">
        <!-- Site sections, one list item per destination -->
        <ul>
            <li class="menu-item"><a href="/">Home</a></li>
            <li class="menu-item"><a href="/archives">Archives</a></li>
            <li class="menu-item"><a href="/about.html">About</a></li>
            <li class="menu-item"><a href="/tags">Tags</a></li>
            <li class="menu-item"><a href="/categories">Categories</a></li>
            <li class="menu-item"><a href="/contact.html">Contact</a></li>
        </ul>
    </div>
</div>

<!--Hamburger Icon-->
<nav>
    <!-- The toggle link has no text content, so aria-label supplies its accessible name -->
    <a href="#menu" aria-label="Menu"></a>
</nav>

<div class="container">

    <!-- Main Content -->
    <div class="row">
    <div class="col-sm-12">

        <!--Title and Logo-->
        <header>
            <div class="logo">
                <!-- The icon is hidden from assistive technology, so the link itself carries the name -->
                <a href="/" aria-label="Home"><i class="logo-icon fa fa-cube" aria-hidden="true"></i></a>
            </div>
        </header>

        <section class="main">
            
<div class="post">

    <div class="post-header">
        <!-- Post title, linking to the post's permalink -->
        <h1 class="title">
            <a href="/2019/05/20/user-17163699-1558319415/Python/腾讯广告算法大赛——统计日曝光量+数据清洗/">Untitled</a>
        </h1>
        <div class="post-info">
            <!-- Machine-readable publication date; the span and its class are left as they were
                 so any existing styling hooked on them keeps applying -->
            <span class="date"><time datetime="2019-05-20">2019-05-20</time></span>
        </div>
    </div>

    <div class="content">

        <!-- Gallery -->
        

        <!-- Post Content -->
        <!-- Post body. The source post was a complete standalone HTML page; its nested
             doctype, html, head and body tags, its duplicate meta and title elements, and its
             page-wide style rules (body, .container, .post) are dropped here because they sat
             inside the content div, where they are invalid and override the site theme.
             The duplicated .container and .post wrappers are dropped as well, and the headings
             are demoted so the page keeps a single h1 (the one in the post header).
             The article text itself is unchanged. -->
        <h2 class="title" lang="zh-Hans">腾讯广告算法大赛——统计日曝光量+数据清洗</h2>
        <div class="show-content" lang="zh-Hans">
          <blockquote><p>################统计日曝光量##################<br>import pandas as pd<br>import numpy as nd<br>import time, datetime<br>#数据集如下<br>df1=pd.read_table('D:/testA/ad_static_feature.out',sep = '\t',header=None,engine='python')<br>df2=pd.read_table('D:/testA/totalExposureLog.out',sep = '\t',header=None,engine='python')<br>#增加 列名<br>df1.columns=['广告id','创建时间','广告账户id','商品id','商品类型','广告行业id','素材尺寸']<br>df2.columns=['广告请求id','广告请求时间','广告位id','用户id','广告id','曝光广告素材尺寸','曝光广告出价bid','曝光广告pctr','曝光广告quality_ecpm','曝光广告totalEcpm']<br>#拿出需要的特征<br>df1=df1[['广告id','创建时间','商品id', '商品类型', '广告行业id', '素材尺寸']]<br>df2=df2[['广告id','广告请求时间','曝光广告出价bid']]<br>#合并两个表<br>df3=pd.merge(df1,df2,on='广告id')<br>#转化日期<br>df3['year'] =df3['创建时间'].apply(lambda x: time.localtime(x).tm_year)<br>df3['month'] = df3['创建时间'].apply(lambda x: time.localtime(x).tm_mon)<br>df3['day'] =df3['创建时间'].apply(lambda x: time.localtime(x).tm_mday)<br>df3["创建日期"]=df3["year"].map(str)+df3["month"].map(str)+df3["day"].map(str)<br>df3['year'] =df3['广告请求时间'].apply(lambda x: time.localtime(x).tm_year)<br>df3['month'] = df3['广告请求时间'].apply(lambda x: time.localtime(x).tm_mon)<br>df3['day'] =df3['广告请求时间'].apply(lambda x: time.localtime(x).tm_mday)<br>df3["广告请求日期"]=df3["year"].map(str)+df3["month"].map(str)+df3["day"].map(str)<br>#去掉时间戳<br>df3=df3[['广告id','创建日期','广告请求日期','商品id', '商品类型', '广告行业id', '素材尺寸','曝光广告出价bid']]<br>#找出满足条件的数据（年月相同，曝光日为创建的第二日）<br>df=df3[df3.创建日期.map(int)+1==df3.广告请求日期.map(int)]<br>#更新一下index<br>df=df.reset_index(drop=True)<br>#统计相同ID和创建时间的数据 + 去重<br>df['次日曝光量']=df.groupby(['广告id','创建日期'])['广告id'].transform(len)<br>df=df.drop_duplicates()<br>#导出待清理脏数据的表<br>df.to_csv("D:/wating for washing.csv")</p></blockquote><hr><h3>数据清洗：</h3><blockquote>
<p>import pandas as pd<br>import numpy as nd<br>import time, datetime</p>
<p>#数据集如下<br>df1=pd.read_csv('D:/wating for washing.csv')</p>
<p>#去掉出价<br>df=df1[["广告id","创建日期","素材尺寸","广告行业id","商品类型","商品id","次日曝光量"]]</p>
<p>#查看基本信息<br>df.describe()</p>
<p>#仅仅查看行数<br>df.count()</p>
<p>#去重(除出价外 去重)<br>df=df.drop_duplicates()</p>
<p>#删掉空值所在行<br>df=df.dropna()</p>
<p>#仅仅查看行数<br>df.count()</p>
<p>####################################清理带有“,”的脏数据########################<br>df2=df.astype(str)#令所有内容变成字符串<br>#查看 广告id 中的异常数据<br>df2[df2["广告id"].str.contains(",")]#包含逗号的数据（无）<br>#查看 商品类型 中的异常数据<br>df2[df2["商品类型"].str.contains(",")]#包含逗号的数据（无）<br>#查看 素材尺寸 中的异常数据<br>df2[df2["素材尺寸"].str.contains(",")]#包含逗号的数据<br>#查看 商品id 中的异常数据<br>df2[df2["商品id"].str.contains(",")]#包含逗号的数据<br>#查看 广告行业id 中的异常数据<br>df2[df2["广告行业id"].str.contains(",")]#包含逗号的数据</p>
<p>###########清理广告行业id的脏数据<br>df3=df2[df2["广告行业id"].str.contains(",")]#包含逗号的数据<br>L1=list(df3.广告行业id)#包含逗号的数据做成一个列表list1<br>L2=list(df2.广告行业id)#全部数据做成一个列表list2<br>L3=list(set(L2)^set(L1))#列表求差集的方法：去掉脏数据的正常数据集合<br>df=df[df.广告行业id.isin(L3)]#isin()搜寻正常数据集合的最终结果<br>#直接方法   ~isin()搜寻不包含异常值的最终结果<br>#df4=df[~df.广告行业id.isin(L1)]#isin()搜寻正常数据集合的最终结果</p>
<p>###########清理素材尺寸的脏数据<br>df3=df2[df2["素材尺寸"].str.contains(",")]#包含逗号的数据<br>L=list(df3.素材尺寸)#包含逗号的数据做成一个列表list1<br>#包含逗号的数据做成一个列表list1<br>df=df[~df.素材尺寸.isin(L)]#~isin()搜寻不包含异常值的最终结果</p>
<p>###########清理商品id的脏数据<br>df3=df2[df2["商品id"].str.contains(",")]#包含逗号的数据<br>L=list(df3.商品id)#包含逗号的数据做成一个列表list1<br>#包含逗号的数据做成一个列表list1<br>df=df[~df.商品id.isin(L)]#~isin()搜寻不包含异常值的最终结果<br>#重置索引<br>df=df.reset_index(drop=True)<br>#导出数据<br>df.to_csv("D:/train.csv")</p>
</blockquote><hr><p><br></p><hr><p lang="en">If you are interested in this topic.<br>You can get in touch with me.<br>18234056952(Tel  wechat  qq)</p>
        </div>

    </div>

    

    

    <!-- Comments -->
    

</div>
        </section>

    </div>
</div>


</div>

<!-- Footer -->
<div class="push"></div>

<footer class="footer-content">
    <div class="container">
        <div class="row">
            <div class="col-xs-12 col-sm-12 col-md-6 col-lg-6 footer-about">
                <h2>About</h2>
                <p>
                    This theme was developed by <a href="https://github.com/klugjo">Jonathan Klughertz</a>. The source code is available on GitHub. Create Websites. Make Magic.
                </p>
            </div>
            
    <div class="col-xs-6 col-sm-6 col-md-3 col-lg-3 recent-posts">
        <h2>Recent Posts</h2>
        <!-- One entry per post, each linking to that post's permalink -->
        <ul>
            <li><a class="footer-post" href="/2019/05/20/user-17163699-1558319415/量子科学/量子算法/">Untitled</a></li>
            <li><a class="footer-post" href="/2019/05/20/user-17163699-1558319415/数学建模/建模/">Untitled</a></li>
            <li><a class="footer-post" href="/2019/05/20/user-17163699-1558319415/操作系统/Win10自带linux子系统/">Untitled</a></li>
            <li><a class="footer-post" href="/2019/05/20/user-17163699-1558319415/其他/Markdown/">Untitled</a></li>
        </ul>
    </div>



            
        </div>
        <div class="row">
            <div class="col-xs-12 col-sm-12 col-md-12 col-lg-12">
                <!-- Social links. Every link is icon-only, so each one gets an aria-label as its
                     accessible name and the icon glyph itself is hidden from assistive technology. -->
                <ul class="list-inline footer-social-icons">
                    <li class="list-inline-item">
                        <a href="https://github.com/klugjo/hexo-theme-alpha-dust" aria-label="GitHub">
                            <span class="footer-icon-container">
                                <i class="fa fa-github" aria-hidden="true"></i>
                            </span>
                        </a>
                    </li>
                    <li class="list-inline-item">
                        <a href="https://twitter.com/?lang=en" aria-label="Twitter">
                            <span class="footer-icon-container">
                                <i class="fa fa-twitter" aria-hidden="true"></i>
                            </span>
                        </a>
                    </li>
                    <li class="list-inline-item">
                        <a href="https://www.facebook.com/" aria-label="Facebook">
                            <span class="footer-icon-container">
                                <i class="fa fa-facebook" aria-hidden="true"></i>
                            </span>
                        </a>
                    </li>
                    <li class="list-inline-item">
                        <a href="https://www.instagram.com/" aria-label="Instagram">
                            <span class="footer-icon-container">
                                <i class="fa fa-instagram" aria-hidden="true"></i>
                            </span>
                        </a>
                    </li>
                    <li class="list-inline-item">
                        <a href="https://dribbble.com/" aria-label="Dribbble">
                            <span class="footer-icon-container">
                                <i class="fa fa-dribbble" aria-hidden="true"></i>
                            </span>
                        </a>
                    </li>
                    <li class="list-inline-item">
                        <a href="https://plus.google.com/" aria-label="Google+">
                            <span class="footer-icon-container">
                                <i class="fa fa-google-plus" aria-hidden="true"></i>
                            </span>
                        </a>
                    </li>
                    <li class="list-inline-item">
                        <a href="https://www.behance.net/" aria-label="Behance">
                            <span class="footer-icon-container">
                                <i class="fa fa-behance" aria-hidden="true"></i>
                            </span>
                        </a>
                    </li>
                    <li class="list-inline-item">
                        <a href="https://500px.com/" aria-label="500px">
                            <span class="footer-icon-container">
                                <i class="fa fa-500px" aria-hidden="true"></i>
                            </span>
                        </a>
                    </li>
                    <li class="list-inline-item">
                        <a href="mailto:test@example.com" aria-label="Email">
                            <span class="footer-icon-container">
                                <i class="fa fa-envelope-o" aria-hidden="true"></i>
                            </span>
                        </a>
                    </li>
                    <li class="list-inline-item">
                        <!-- NOTE(review): "\#" looks like an unfilled placeholder rather than a feed URL;
                             left unchanged because the real feed location is not visible here. -->
                        <a href="\#" aria-label="RSS feed">
                            <span class="footer-icon-container">
                                <i class="fa fa-rss" aria-hidden="true"></i>
                            </span>
                        </a>
                    </li>
                </ul>
            </div>
        </div>
        <div class="row">
            <div class="col-xs-12 col-sm-12 col-md-12 col-lg-12">
                <div class="footer-copyright">
                    @Untitled. All rights reserved | Design &amp; Hexo <a href="http://www.codeblocq.com/">Jonathan Klughertz</a>
                </div>
            </div>
        </div>
    </div>
</footer>

<!-- After footer scripts -->

<!-- Third-party scripts are requested over explicit https: protocol-relative URLs fail when
     the page is opened from file:// and would follow an insecure page scheme over http.
     The type and charset attributes are omitted; both are redundant on classic scripts. -->

<!-- jQuery -->
<script src="https://code.jquery.com/jquery-2.1.4.min.js"></script>

<!-- Tween Max -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/gsap/1.18.5/TweenMax.min.js"></script>

<!-- Gallery -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/featherlight/1.3.5/featherlight.min.js"></script>

<!-- Custom JavaScript -->
<script src="/js/main.js"></script>

<!-- Disqus Comments -->



</body>

</html>